The goals for this analysis are to investigate how the gene expression changes based on the mouse model Questions to answer: -What genes are differentially expressed due to Mouse Model Alone?
library(DESeq2)
library(pheatmap)
library(dplyr)
library(dendextend)
design_matrix<-read.table('/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/source_files/Experimental_Design.csv',sep=',',header=TRUE)
rownames(design_matrix)<-design_matrix$Sample
design_matrix$Sample<-NULL
design_matrix
Import matrix containing the counts for all samples. These counts represent the forward strand counts since this was a stranded library.
counts_matrix<-read.table("/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/Count_Tables/allcounts.csv",sep=',',header=TRUE)
Because the numbers after the dot in the ensembl IDs represent versions of genes in certain annotations, we can remove these to more easily conduct our differential gene expression analysis.
counts_matrix$V1<-gsub("\\..*","",counts_matrix$V1)
counts_matrix
rownames(counts_matrix)<-counts_matrix$V1
counts_matrix$V1<-NULL
We need to sort the counts_matrix as well as our experimental design matrix in order to run DESEQ2
counts_matrix<-counts_matrix[,order(colnames(counts_matrix))]
counts_matrix
design_matrix<-design_matrix[order(rownames(design_matrix)),]
design_matrix
dds <- DESeqDataSetFromMatrix(countData = counts_matrix,
colData = design_matrix,
design = ~ Model)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds
class: DESeqDataSet
dim: 46075 72
metadata(1): version
assays(1): counts
rownames(46075): ENSMUSG00000000001 ENSMUSG00000000003 ... N_noFeature N_unmapped
rowData names(0):
colnames(72): SRR8512301 SRR8512302 ... SRR8512439 SRR8512440
colData names(3): Model Genotype Age
Remove genes with counts less than 10
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
Run DESeq
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 101 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
rawcounts.matrix <- counts(dds,normalized=F)
normalizedcounts.matrix <- counts(dds,normalized=T)
vst_dds <- vst(dds)
dists <- dist(t(assay(vst_dds)))
Trying to rename labels….
dend <- dists %>% hclust %>% as.dendrogram
dend %>% set("labels", c(rep('rtg4510',26),rep('J20',42),rep('rtg4510',4))) %>% set("labels_cex", 0.5) %>% plot
`
PCA_Genotype<-plotPCA(vst_dds,intgroup=c("Genotype"))+labs(title = "PCA of mice of different genotypes", color = "Group")+coord_fixed(ratio=3)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
PCA_Genotype
res_1 <- results(dds, contrast = c("Model","rtg4510","J20"))
res1_ordered <- res_1[order(res_1$pvalue),]
head(res1_ordered,200)
log2 fold change (MLE): Model rtg4510 vs J20
Wald test p-value: Model rtg4510 vs J20
DataFrame with 200 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000002109 170.3439 2.46430 0.0522068 47.2027 0 0
ENSMUSG00000003779 56.5614 4.62527 0.1077837 42.9126 0 0
ENSMUSG00000003812 313.0862 3.25292 0.0615835 52.8212 0 0
ENSMUSG00000005150 280.8934 1.39112 0.0325297 42.7648 0 0
ENSMUSG00000005677 328.8963 8.05015 0.1384032 58.1644 0 0
... ... ... ... ... ... ...
ENSMUSG00000087623 138.7509 7.79529 0.1872103 41.6392 0 0
ENSMUSG00000087644 3067.6482 7.06563 0.0834706 84.6481 0 0
ENSMUSG00000089671 110.8091 7.16815 0.1764113 40.6332 0 0
ENSMUSG00000089875 63.7449 4.23792 0.0943981 44.8941 0 0
ENSMUSG00000089990 89.9363 6.19206 0.1608637 38.4926 0 0
nrow(res1_ordered)
[1] 25644
Filter the res1_ordered byp value to obtain only those genes with p value less than 0.05
#Filter Differentially Expressed Genes by p value 0.05
res1_ordered_filtered <- res1_ordered[res1_ordered$pvalue<0.05,]
See how many genes have p value equals to zero
res1_ordered_filtered_2 <- res1_ordered[res1_ordered$pvalue==0,]
Genes with pvalue==0
res1_ordered_filtered_2
log2 fold change (MLE): Model rtg4510 vs J20
Wald test p-value: Model rtg4510 vs J20
DataFrame with 276 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000002109 170.3439 2.46430 0.0522068 47.2027 0 0
ENSMUSG00000003779 56.5614 4.62527 0.1077837 42.9126 0 0
ENSMUSG00000003812 313.0862 3.25292 0.0615835 52.8212 0 0
ENSMUSG00000005150 280.8934 1.39112 0.0325297 42.7648 0 0
ENSMUSG00000005677 328.8963 8.05015 0.1384032 58.1644 0 0
... ... ... ... ... ... ...
ENSMUSG00000105255 2.58131e+02 8.78867 0.2049789 42.8760 0 0
ENSMUSG00000105868 1.74106e+02 8.17112 0.1901126 42.9804 0 0
ENSMUSG00000106106 6.21895e+03 9.45338 0.2339586 40.4062 0 0
ENSMUSG00000107191 7.00007e+01 5.99563 0.1513276 39.6202 0 0
N_noFeature 6.51300e+06 3.40452 0.0298621 114.0082 0 0
Genes with pvalue<0.05
res1_ordered_filtered
log2 fold change (MLE): Model rtg4510 vs J20
Wald test p-value: Model rtg4510 vs J20
DataFrame with 16041 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000002109 170.3439 2.46430 0.0522068 47.2027 0 0
ENSMUSG00000003779 56.5614 4.62527 0.1077837 42.9126 0 0
ENSMUSG00000003812 313.0862 3.25292 0.0615835 52.8212 0 0
ENSMUSG00000005150 280.8934 1.39112 0.0325297 42.7648 0 0
ENSMUSG00000005677 328.8963 8.05015 0.1384032 58.1644 0 0
... ... ... ... ... ... ...
ENSMUSG00000071531 7.44337 -0.3215877 0.1638888 -1.96223 0.0497356 0.0795236
ENSMUSG00000040258 53.95094 -0.2108794 0.1074991 -1.96169 0.0497991 0.0796201
ENSMUSG00000026768 95.60908 0.2181978 0.1112373 1.96155 0.0498146 0.0796383
ENSMUSG00000104212 2.63321 0.4377779 0.2231813 1.96153 0.0498167 0.0796383
ENSMUSG00000014763 882.42615 0.0590712 0.0301286 1.96063 0.0499218 0.0798015
#GO-Term Enrichment
Install Mouse annotation library:
BiocManager::install("org.Mm.eg.db")
install.packages("devtools")
devtools::install_github("stephenturner/annotables")
install.packages("ggnewscale")
library(biomaRt) #For conversion of transcript IDs to gene ID
library(annotables) #to retrieve grcm38 annotation for mouse genome
library(org.Mm.eg.db) #Mouse genome annotation
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)
library(ggnewscale)
Because we have transcript IDs, we want ensembl ids in order to be able to conduct GO Term enrichment we need ensembl ID
In order to conduct the hypergeometric test on each set of differentially expressed genes we need to have two sets of genes: a background set, and a significant differentially expressed gene set. The background set will be comprised of all differentially expressed genes and the genes of interest will be those with significant p values (0.05).
We are using ‘ALL’ for ontology because we want to see all of the differentially expressed genes accross all categories.
idx <- grcm38$ensgene %in% rownames(res1_ordered)
ids <- grcm38[idx, ]
non_duplicates <- which(duplicated(ids$ensgene) == FALSE)
ids <- ids[non_duplicates, ]
ids
## Use mouse genome
ego <- enrichGO(gene = rownames(res1_ordered_filtered),
universe =ids$ensgene ,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "ALL",
readable = TRUE,
pAdjustMethod="none",
pool=TRUE)
## Output results from GO analysis to a table
enriched_genes_res1 <- data.frame(ego)
enriched_genes_res1
dotplot(ego, showCategory=50)+theme(text = element_text(size = 1)) +scale_y_discrete(labels=function(x) str_wrap(x, width=40))+ggtitle('Enriched genes when comparing rtg4510 and J20 mice')+ theme(plot.title = element_text(size=16))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
#SPIA
# mouse genome load
grcm38
# check that ensgene in our df is prsent in the mouse genome df
idx <- grcm38$ensgene %in% rownames(res1_ordered)
head(idx)
[1] TRUE FALSE TRUE TRUE TRUE TRUE
# df with all the ids that are in our df from the mouse genome df
ids <- grcm38[idx, ]
head(ids)
# remove duplicates
non_duplicates <- which(duplicated(ids$ensgene) == FALSE)
ids <- ids[non_duplicates, ]
# entrezID contains only the IDs that are also in our df
entrezID= grcm38[grcm38$ensgene %in% rownames(res1_ordered), ]
# check nrow entrezID
head(entrezID)
# create a vector of only the entrezIDs
entrez_ID_vector = c(entrezID[[2]])
head(entrez_ID_vector)
[1] 14679 12544 NA 107815 11818 67608
# create new df that contains only the entrezID, lfc and padj
res2= data.frame(log2foldchange= subset(res1_ordered$log2FoldChange, grcm38$ensgene %in% rownames(res1_ordered)))
padj = subset(res1_ordered$padj, grcm38$ensgene %in% rownames(res1_ordered))
res2 = cbind(padj, res2)
res2 = cbind(entrez_ID_vector, res2)
# omit all "na" values
res2 = na.omit(res2)
head(res2)
## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. The background set is a vector of all the genes represented on the platform.
# bg entrez contains all the entrezIDs
background_entrez <- res2$entrez_ID_vector
# sig res entrez contains all the entrezIDs that have padj <0.05
sig_res_entrez <- res2[which(res2$padj < 0.05), ]
# vector of only lfc values
sig_entrez <- sig_res_entrez$log2foldchange
head(sig_res_entrez)
# adding entrezIDs as names for the sig entrez
names(sig_entrez) <- sig_res_entrez$entrez
head(sig_entrez)
# remove dups
dups<-unique(names(sig_entrez[which(duplicated(names(sig_entrez)))]))
sig_entrez<-sig_entrez[!(names(sig_entrez) %in% dups)]
#de= as.vector(sig_entrez)
#de = sort(de, decreasing = FALSE)
# this step takes time
spia_result <- spia(de=sig_entrez, all=background_entrez, organism="mmu")
Done pathway 1 : RNA transport..
Done pathway 2 : RNA degradation..
Done pathway 3 : PPAR signaling pathway..
Done pathway 4 : Fanconi anemia pathway..
Done pathway 5 : MAPK signaling pathway..
Done pathway 6 : ErbB signaling pathway..
Done pathway 7 : Calcium signaling pathway..
Done pathway 8 : Cytokine-cytokine receptor int..
Done pathway 9 : Chemokine signaling pathway..
Done pathway 10 : NF-kappa B signaling pathway..
Done pathway 11 : Phosphatidylinositol signaling..
Done pathway 12 : Neuroactive ligand-receptor in..
Done pathway 13 : Cell cycle..
Done pathway 14 : Oocyte meiosis..
Done pathway 15 : p53 signaling pathway..
Done pathway 16 : Sulfur relay system..
Done pathway 17 : SNARE interactions in vesicula..
Done pathway 18 : Regulation of autophagy..
Done pathway 19 : Protein processing in endoplas..
Done pathway 20 : Lysosome..
Done pathway 21 : mTOR signaling pathway..
Done pathway 22 : Apoptosis..
Done pathway 23 : Vascular smooth muscle contrac..
Done pathway 24 : Wnt signaling pathway..
Done pathway 25 : Dorso-ventral axis formation..
Done pathway 26 : Notch signaling pathway..
Done pathway 27 : Hedgehog signaling pathway..
Done pathway 28 : TGF-beta signaling pathway..
Done pathway 29 : Axon guidance..
Done pathway 30 : VEGF signaling pathway..
Done pathway 31 : Osteoclast differentiation..
Done pathway 32 : Focal adhesion..
Done pathway 33 : ECM-receptor interaction..
Done pathway 34 : Cell adhesion molecules (CAMs)..
Done pathway 35 : Adherens junction..
Done pathway 36 : Tight junction..
Done pathway 37 : Gap junction..
Done pathway 38 : Complement and coagulation cas..
Done pathway 39 : Antigen processing and present..
Done pathway 40 : Toll-like receptor signaling p..
Done pathway 41 : NOD-like receptor signaling pa..
Done pathway 42 : RIG-I-like receptor signaling ..
Done pathway 43 : Cytosolic DNA-sensing pathway..
Done pathway 44 : Jak-STAT signaling pathway..
Done pathway 45 : Natural killer cell mediated c..
Done pathway 46 : T cell receptor signaling path..
Done pathway 47 : B cell receptor signaling path..
Done pathway 48 : Fc epsilon RI signaling pathwa..
Done pathway 49 : Fc gamma R-mediated phagocytos..
Done pathway 50 : Leukocyte transendothelial mig..
Done pathway 51 : Intestinal immune network for ..
Done pathway 52 : Circadian rhythm - mammal..
Done pathway 53 : Long-term potentiation..
Done pathway 54 : Neurotrophin signaling pathway..
Done pathway 55 : Retrograde endocannabinoid sig..
Done pathway 56 : Glutamatergic synapse..
Done pathway 57 : Cholinergic synapse..
Done pathway 58 : Serotonergic synapse..
Done pathway 59 : GABAergic synapse..
Done pathway 60 : Dopaminergic synapse..
Done pathway 61 : Long-term depression..
Done pathway 62 : Olfactory transduction..
Done pathway 63 : Taste transduction..
Done pathway 64 : Phototransduction..
Done pathway 65 : Regulation of actin cytoskelet..
Done pathway 66 : Insulin signaling pathway..
Done pathway 67 : GnRH signaling pathway..
Done pathway 68 : Progesterone-mediated oocyte m..
Done pathway 69 : Melanogenesis..
Done pathway 70 : Adipocytokine signaling pathwa..
Done pathway 71 : Type II diabetes mellitus..
Done pathway 72 : Type I diabetes mellitus..
Done pathway 73 : Maturity onset diabetes of the..
Done pathway 74 : Aldosterone-regulated sodium r..
Done pathway 75 : Endocrine and other factor-reg..
Done pathway 76 : Vasopressin-regulated water re..
Done pathway 77 : Salivary secretion..
Done pathway 78 : Gastric acid secretion..
Done pathway 79 : Pancreatic secretion..
Done pathway 80 : Carbohydrate digestion and abs..
Done pathway 81 : Bile secretion..
Done pathway 82 : Mineral absorption..
Done pathway 83 : Alzheimer's disease..
Done pathway 84 : Parkinson's disease..
Done pathway 85 : Amyotrophic lateral sclerosis ..
Done pathway 86 : Huntington's disease..
Done pathway 87 : Prion diseases..
Done pathway 88 : Cocaine addiction..
Done pathway 89 : Amphetamine addiction..
Done pathway 90 : Morphine addiction..
Done pathway 91 : Alcoholism..
Done pathway 92 : Bacterial invasion of epitheli..
Done pathway 93 : Salmonella infection..
Done pathway 94 : Pertussis..
Done pathway 95 : Legionellosis..
Done pathway 96 : Leishmaniasis..
Done pathway 97 : Chagas disease (American trypa..
Done pathway 98 : African trypanosomiasis..
Done pathway 99 : Malaria..
Done pathway 100 : Toxoplasmosis..
Done pathway 101 : Amoebiasis..
Done pathway 102 : Staphylococcus aureus infectio..
Done pathway 103 : Tuberculosis..
Done pathway 104 : Hepatitis C..
Done pathway 105 : Measles..
Done pathway 106 : Influenza A..
Done pathway 107 : HTLV-I infection..
Done pathway 108 : Herpes simplex infection..
Done pathway 109 : Epstein-Barr virus infection..
Done pathway 110 : Pathways in cancer..
Done pathway 111 : Transcriptional misregulation ..
Done pathway 112 : Viral carcinogenesis..
Done pathway 113 : Colorectal cancer..
Done pathway 114 : Renal cell carcinoma..
Done pathway 115 : Pancreatic cancer..
Done pathway 116 : Endometrial cancer..
Done pathway 117 : Glioma..
Done pathway 118 : Prostate cancer..
Done pathway 119 : Thyroid cancer..
Done pathway 120 : Basal cell carcinoma..
Done pathway 121 : Melanoma..
Done pathway 122 : Bladder cancer..
Done pathway 123 : Chronic myeloid leukemia..
Done pathway 124 : Acute myeloid leukemia..
Done pathway 125 : Small cell lung cancer..
Done pathway 126 : Non-small cell lung cancer..
Done pathway 127 : Asthma..
Done pathway 128 : Autoimmune thyroid disease..
Done pathway 129 : Systemic lupus erythematosus..
Done pathway 130 : Rheumatoid arthritis..
Done pathway 131 : Allograft rejection..
Done pathway 132 : Graft-versus-host disease..
Done pathway 133 : Arrhythmogenic right ventricul..
Done pathway 134 : Dilated cardiomyopathy..
Done pathway 135 : Viral myocarditis..
Warning: stack imbalance in '<-', 2 then 3
plotP(spia_result, threshold=0.05)
Error in if (sum(oky) > 0) { : missing value where TRUE/FALSE needed
write.csv(spia_result, file = '/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/source_files/SPIA_J20_vs_TG.csv')
spia_result
p1 <- cnetplot(ego, categorySize="pvalue", showCategory = 4,node_label="category",cex_label_gene = 0.5,cex_label_category=0.9,shadowtext='category',color_category='pink', color_gene='purple' )+ggtitle('Enriched Genes when comparing J20 and rtg4510 mice')
p1
p2 <- cnetplot(ego, categorySize="pvalue", showCategory = 1,node_label="all",cex_label_gene = 0.5,cex_label_category=0.9,shadowtext='category',color_category='pink', color_gene='darkmagenta' )+ggtitle('Enriched Genes when comparing J20 and rtg4510 mice')
p2
Warning: ggrepel: 60 unlabeled data points (too many overlaps). Consider increasing max.overlaps
norm_counts_top_40<-normalizedcounts.matrix[rownames(head(res1_ordered_filtered,40)),]
nrow(norm_counts_top_40)
[1] 40
head(design_matrix)
annotation_columns<-design_matrix
row.names(annotation_columns) <- colnames(norm_counts_top_40)
library(pheatmap)
tiff("Heatmap_J20_vs_rtg4510.tiff", width = 7, height = 5, units = 'in', res = 300)
pheatmap(norm_counts_top_40, color=colorRampPalette(c("white", "maroon1", "darkmagenta"))(30), scale="row", cluster_cols = T, show_rownames = T,fontsize = 7,fontsize_row = 4, fontsize_col = 4,labels_row = rownames(dists),annotation_col =annotation_columns,main='Differentially Expressed Genes in J20 and rtg4510 mice' )
dev.off()
null device
1